Einlesen der Daten und Darstellung. Da der Datensatz nur 3 Variablen hat, bietet sich 3D Darstellung an. Aber auch andere Darstellungen sind möglich!
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
cluster.dat <- read_csv2( "cluster_example.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## New names:Rows: 100 Columns: 4── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (4): ...1, Z1, Z2, Z3
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 3D Darstellung
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(
cluster.dat, x = ~Z1, y = ~Z2, z = ~Z3) %>%
add_markers()
EM Algorithmus ist im Paket mclust implementiert. Bei Clusterverfahren muss sich Gedanken zur Wahl der Anzahl an Clustern gemacht werden, z.B. mittels Betrachtung Informationskriterien wie BIC.
# EM Algorithmus
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
##
## map
# Anzahl an Clustern
BIC <- mclustBIC(cluster.dat, G=1:10, model = c("EII", "EEI"))
plot(BIC)
Der in diesem Paket implementierte Algorithmus hat zur Bestimmung der Startwerte hierarchisches Clustern implementiert und entscheidet darüber auch die Wahl der Cluster.
# Cluster - Hierarchiches Clustern zur Initialisierung
# Auswahl Cluster = 2
mc <- Mclust (cluster.dat)
# Ergebnis
summary ( mc )
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 2 components:
##
## log-likelihood n df BIC ICL
## -1540.838 100 23 -3187.595 -3191.677
##
## Clustering table:
## 1 2
## 46 54
Die ersten 6 Zeilen der Matrix mit den Wahrscheinlichkeiten, dass eine Beobachtung entweder zu Cluster 1 oder zu Cluster 2 gehört.
head(mc$z)
## [,1] [,2]
## [1,] 0.9999998 1.506185e-07
## [2,] 0.9998465 1.534586e-04
## [3,] 1.0000000 4.434705e-11
## [4,] 0.9993320 6.680474e-04
## [5,] 0.9993203 6.796786e-04
## [6,] 1.0000000 1.470275e-09
Darstellung mit der Farbe gemäß der Klasse, die die größte Wahrschenlichkeit hat.
plot_ly(
cluster.dat, x = ~Z1, y = ~Z2, z = ~Z3,
color = mc$classification) %>%
add_markers()
# Ausprobieren von 3 Clusterklassen
# eine Klasse ist staerker besetzt als die zwei anderen Klassen
mc.3 <- Mclust (cluster.dat, G=3)
summary(mc.3)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 3 components:
##
## log-likelihood n df BIC ICL
## -1531.627 100 32 -3210.619 -3218.583
##
## Clustering table:
## 1 2 3
## 26 20 54
# 3D grafik mit Farbe der zugeordneten Cluster
plot_ly(
cluster.dat, x = ~Z1, y = ~Z2, z = ~Z3,
color = mc.3$classification) %>%
add_markers()
Weitergehende Infos zum R Paket gRain z.B. in der Publikation https://www.jstatsoft.org/article/view/v046i10
Hinweise zur Installation: https://people.math.aau.dk/~sorenh/software/gR/
Quelle der Beispiele: http://www.di.fc.ul.pt/~jpn/r/bayesnets/bayesnets.html (Abruf 17.12.2020)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(version = "3.18")
BiocManager::install(c("graph", "RBGL", "Rgraphviz"))
install.packages("gRbase", dependencies=TRUE);
install.packages("gRain", dependencies=TRUE);
install.packages("gRim", dependencies=TRUE)
library(gRain)
library(gRbase)
library(gRim)
#library(Rgraphviz)
# the possible values (all nodes are boolean vars)
yn <- c("yes","no")
# specify the CPTs
node.C <- cptable(~ C, values=c(1, 99), levels=yn)
node.T1 <- cptable(~ T1 + C, values=c(9,1,2,8), levels=yn)
node.T2 <- cptable(~ T2 + C, values=c(9,1,2,8), levels=yn)
# create an intermediate representation of the CPTs
plist <- compileCPT(list(node.C, node.T1, node.T2))
plist
## P( C )
## P( T1 | C )
## P( T2 | C )
plist$C
## C
## yes no
## 0.01 0.99
plist$T1
## C
## T1 yes no
## yes 0.9 0.2
## no 0.1 0.8
# create network
bn.cancer <- grain(plist)
summary(bn.cancer)
## Independence network: Compiled: TRUE Propagated: FALSE
## Nodes : chr [1:3] "C" "T1" "T2"
## Number of cliques: 2
## Maximal clique size: 2
## Maximal state space in cliques: 4
# The marginal probability for each variable:
querygrain(bn.cancer, nodes=c("C", "T1", "T2"), type="marginal")
## $C
## C
## yes no
## 0.01 0.99
##
## $T1
## T1
## yes no
## 0.207 0.793
##
## $T2
## T2
## yes no
## 0.207 0.793
# The joint probability P(C,T1):
querygrain(bn.cancer, nodes=c("C","T1"), type="joint")
## T1
## C yes no
## yes 0.009 0.001
## no 0.198 0.792
# P(T1=+ | T2=+):
# 1. add evidence to the net
bn.cancer.1 <- setFinding(bn.cancer, nodes=c("T2"), states=c("yes"))
# 2. query the new net
querygrain(bn.cancer.1, nodes=c("T1"))
## $T1
## T1
## yes no
## 0.2304348 0.7695652
# The probability of this evidence:
# print(getFinding(bn.cancer.1))
# The conditional P(not C | not T1)
bn.cancer.2 <- setFinding(bn.cancer, nodes=c("T1"), states=c("no"))
querygrain(bn.cancer.2, nodes=c("C"))
## $C
## C
## yes no
## 0.001261034 0.998738966
data("cad1")
head(cad1)
## Sex AngPec AMI QWave QWavecode STcode STchange SuffHeartF
## 1 Male None NotCertain No Usable Usable No No
## 2 Male Atypical NotCertain No Usable Usable No No
## 3 Female None Definite No Usable Usable No No
## 4 Male None NotCertain No Usable Nonusable No No
## 5 Male None NotCertain No Usable Nonusable No No
## 6 Male None NotCertain No Usable Nonusable No No
## Hypertrophi Hyperchol Smoker Inherit Heartfail CAD
## 1 No No No No No No
## 2 No No No No No No
## 3 No No No No No No
## 4 No No No No No No
## 5 No No No No No No
## 6 No No No No No No
# create the DAG
dag.cad <- dag(~ CAD:Smoker:Inherit:Hyperchol +
AngPec:CAD +
Heartfail:CAD +
QWave:CAD)
plot(dag.cad)
# smooth is a small positive number to avoid zero entries in the CPTs
# (cf. Additive smoothing, http://en.wikipedia.org/wiki/Additive_smoothing)
bn.cad <- grain(dag.cad, data = cad1, smooth = 0.1)
# Let's ask some questions...
querygrain(bn.cad, nodes=c("CAD", "Smoker"), type="conditional")
## Smoker
## CAD No Yes
## No 0.7172084 0.4933771
## Yes 0.2827916 0.5066229